# Clean ADL data 

# Load Libraries ----------------------

# For exact replication, run code on R 4.2.2 to use exact groundhog version
# library(groundhog)
# groundhog.library(tidyverse, "2023-01-01")
# groundhog.library(haven, "2023-01-01")
# groundhog.library(sjlabelled, "2023-01-01")
library(tidyverse)
library(haven)
library(sjlabelled)

# setwd("path/to/RP_NYZ_replication_files")  # folder that holds the replication files

# Folder that receives every cleaned file; it is created (including any
# missing parent folders) only when it is not already there.
output_dir <- "output/data"
if (isFALSE(dir.exists(output_dir))) {
  dir.create(output_dir, recursive = TRUE)
}

# Clean 2013 ---------------------------
# Read the raw 2013 Stata file
dat2013 <- read_dta("input/ADLraw/ADL 2013 Combined File.dta")

# Identifier, date, age, weight and index columns become plain numerics;
# every remaining column goes through as_character() (per the original note,
# this turns the values into their labels).
dat2013 <-
  dat2013 %>%
  mutate(
    across(
      c(respnumber, wave, id, date,
        yearborn, Age, schoolage, weight, First50GlobalShare,
        InternationalFirstWaveWeight, InternationalProportion,
        InternationalTotalWeight, hdi, gdp),
      as.numeric
    ),
    across(
      -c(respnumber, wave, id, date,
         yearborn, Age, schoolage, weight, First50GlobalShare,
         InternationalFirstWaveWeight, InternationalProportion,
         InternationalTotalWeight, hdi, gdp),
      as_character
    )
  )

# Country names: upper-cased copy of TotalCountry stored as `country`;
# both original country columns are then dropped.
dat2013 <-
  dat2013 %>%
  mutate(country = toupper(TotalCountry)) %>%
  select(-c(TotalCountry, COUNTRY_II))

# Date
# Derive numeric `year`, `month` and `day` columns for the 2013 wave.
# `date` is turned into a string and sliced by character position; which
# positions hold the year / month / day differs by country group, and some
# countries get a hard-coded value instead of a parsed one.
# case_when() uses the FIRST matching branch, so the order of branches below
# matters whenever a country appears in more than one list.
dat2013 <- 
  dat2013 %>% 
  mutate(
    date = as.character(date),
    # year: NA for the countries sampled across 2013-2014, a constant for
    # the Philippines, parsed from `date` for the listed groups (last two
    # characters, last four characters, first four characters, or first two
    # characters for the USA), and 2013 for every other country.
    year = case_when(
      country %in% c("AUSTRIA", "BELGIUM", "BOSNIA AND HERZEGOVINA",
                     "FINLAND", "ICELAND") ~ NA_real_, # these countries sample over 2013-2014
      country %in% c("PHILIPPINES") ~ 2014,
      country %in% c("HUNGARY", "GREECE", "ROMANIA", "SERBIA") ~ 
        paste0("20", str_sub(date, start = -2)) %>% as.numeric(),
      country %in% c("CROATIA", "LATVIA", "LIBYA", "LITHUANIA",
                     "MALAYSIA","MAURITIUS", "MOLDOVA", "MONGOLIA",
                     "MONTENEGRO", "OMAN", "PORTUGAL", "QATAR", 
                     "SINGAPORE", "UNITED ARAB EMIRATES", "VIETNAM") ~ 
        str_sub(date, start = -4) %>% as.numeric(),
      country %in% c("CHINA", "INDIA", "INDONESIA", "THAILAND") ~ 
        str_sub(date, end = 4) %>% as.numeric(),
      country == "USA" ~ paste0("20", str_sub(date, end = 2)) %>% as.numeric(),
      TRUE ~ 2013
    ),
    # month: constant months for the first five country lists, parsed from
    # `date` for the remaining lists, NA otherwise.
    # NOTE(review): SERBIA (month 8 list), MOLDOVA (month 11 list),
    # MONTENEGRO and PORTUGAL (month 12 list) also appear in the
    # date-parsing lists further down. Because the constant branches come
    # first, the parsed month is never used for these four countries --
    # confirm that the constants are what is intended.
    month = case_when(
      country %in% c("PHILIPPINES") ~ 1,
      country %in% c("ALGERIA", "COLOMBIA", "COSTA RICA", "DOMINICAN REPUBLIC",
                    "GUATEMALA", "HAITI", "MOLDOVA", "NICARAGUA",
                    "URUGUAY", "WEST BANK AND GAZA") ~ 11,
      country %in% c("CANADA", "RUSSIA", "TURKEY", "UKRAINE") ~ 7,
      country %in% c("AUSTRALIA", "AZERBAIJAN", "BRAZIL", "CHILE",
                     "EGYPT", "GEORGIA", "GHANA", "JAPAN", "KENYA",
                     "NIGERIA", "PANAMA", "PERU", "SENEGAL", "SERBIA",
                     "SOUTH AFRICA", "SOUTH KOREA", "VENEZUELA") ~ 8,
      country%in% c("BANGLADESH", "BULGARIA", "CZECH REPUBLIC", "COTE D'IVOIRE",
                    "ESTONIA", "MONTENEGRO", "PORTUGAL") ~ 12,
      country %in% c("HUNGARY", "ROMANIA", "SERBIA") ~ 
        str_sub(date, start = -6, end = -5) %>% as.numeric(),
      country == "GREECE" ~ str_sub(date, start = -4, end = -3) %>% as.numeric(),
      country %in% c("CROATIA", "LATVIA", "LIBYA", "LITHUANIA",
                     "MALAYSIA","MAURITIUS", "MOLDOVA", "MONGOLIA",
                     "MONTENEGRO", "OMAN", "PORTUGAL", "QATAR", 
                     "SINGAPORE", "UNITED ARAB EMIRATES", "VIETNAM") ~ 
        str_sub(date, end = -7) %>% as.numeric(),
      country %in% c("CHINA", "INDIA", "INDONESIA", "THAILAND") ~ 
        str_sub(date, start = 5, end = 6) %>% as.numeric(),
      country == "USA" ~ str_sub(date, start = 3, end = 4) %>% as.numeric(),
      TRUE ~ NA_real_
    ),
    # day: parsed from `date` only for the listed countries (first two
    # characters, characters -6..-5, or the last two characters); NA for
    # everyone else. Unlike `month`, there are no constant branches here.
    day = case_when(
      country %in% c("HUNGARY", "GREECE", "ROMANIA", "SERBIA") ~ 
        str_sub(date, start = 1, end = 2) %>% as.numeric(),
      country %in% c("CROATIA", "LATVIA", "LIBYA", "LITHUANIA",
                     "MALAYSIA","MAURITIUS", "MOLDOVA", "MONGOLIA",
                     "MONTENEGRO", "OMAN", "PORTUGAL", "QATAR", 
                     "SINGAPORE", "UNITED ARAB EMIRATES", "VIETNAM") ~ 
        str_sub(date, start = -6, end = -5) %>% as.numeric(),
      country %in% c("CHINA", "INDIA", "INDONESIA",
                     "THAILAND", "USA") ~ 
        str_sub(date, start = -2, end = -1) %>% as.numeric(),
      TRUE ~ NA_real_
    )) %>% 
  # New_Age is not carried forward
  select(-New_Age)

# Bin GDP and HDI
# gdp: codes 1 / 2 / 3 are replaced by their income-band text; any other
#      value (including NA) becomes NA.
# hdi: values are grouped into the bands 1-47, 48-141 and 142-187; anything
#      outside those bands (including NA) becomes NA.
dat2013 <-
  dat2013 %>%
  mutate(
    gdp = c("$50/day or more",
            "$10/day - $50/day",
            "Less than $10/day")[match(gdp, c(1, 2, 3))],
    hdi = case_when(
      between(hdi, 1, 47) ~ "1-47",
      between(hdi, 48, 141) ~ "48-141",
      between(hdi, 142, 187) ~ "142-187",
      TRUE ~ NA_character_
    )
  )

# Consolidate ETHNICITY, LANG, GEO (some seem named differently)
# The per-country ETHNICITY_* (also spelled ETHNICTY_*), LANG_* and GEO_*
# columns are collapsed into single `ethnicity`, `language` and `geo`
# columns. With sep = "" and na.rm = TRUE the non-missing values of a row
# are pasted together, so a row without any value ends up as "".
dat2013 <- 
  dat2013 %>% 
  unite(ethnicity, matches("^(ETHNICITY_|ETHNICTY_)"), sep = "", na.rm = TRUE) %>% 
  unite(language, starts_with("LANG_"), sep = "", na.rm = TRUE) %>%
  unite(geo, starts_with("GEO_"), sep = "", na.rm = TRUE)

# Consolidate other country-specific region columns (e.g. the Romania,
# Ukraine, Finland, Iceland and Norway columns used below).
# Units of land size will be geo > geo2 > geo3
dat2013 <- 
  dat2013 %>% 
  # "" back to NA so the empty placeholder is skipped by the next unite()
  mutate(geo = na_if(geo, "")) %>% 
  # NOTE(review): `geo:REGION_FRANCE` is a positional range -- every column
  # that sits between `geo` and `REGION_FRANCE` in the raw file is merged.
  # Confirm the column order of the input file before relying on this.
  unite("geo", geo:REGION_FRANCE, sep = "", na.rm = TRUE) %>% 
  mutate(
    geo2 = case_when(
      country == "ROMANIA" ~ GEO2_ROMANIA,
      country == "UKRAINE" ~ geo,
      country == "FINLAND" ~ GEO2_FINLAND,
      country == "ICELAND" ~ GEO2_ICELAND,
      country == "NORWAY" ~ GEO2_NORWAY,
      TRUE ~ NA_character_
    ),
    geo3 = case_when(
      country == "ROMANIA" ~ geo,
      TRUE ~ NA_character_
    ),
    # geo1 falls back to the merged `geo` value for all other countries
    geo1 = case_when(
      country == "ROMANIA" ~ GEO3_ROMANIA,
      country == "UKRAINE" ~ REGIONS_UKRAINE,
      TRUE ~ geo
    )
  ) %>% 
  select(-geo, -GEO2_ROMANIA, -GEO3_ROMANIA, -GEO2_FINLAND, -GEO2_ICELAND,
         -GEO2_NORWAY, -REGIONS_UKRAINE)

# Region and Q22/Q23
# `Region` is dropped and `RegionII` becomes `world_region`.
# Respondents whose `metjew` answer is "No" get the jewinteract value
# "Have never met a Jewish person"; everyone else (including missing
# `metjew`) keeps their jewinteract answer. `metjew` is then dropped.
dat2013 <-
  dat2013 %>%
  rename(world_region = RegionII) %>%
  mutate(
    jewinteract = if_else(
      metjew %in% "No",
      "Have never met a Jewish person",
      jewinteract
    )
  ) %>%
  select(-c(Region, metjew))

# Delete columns INTACCESS_*, make INTERNET_ columns more understandable
# INTERNET_1..INTERNET_5 are multiple-response slots. They are turned into
# five indicator columns:
#   internet_home / internet_phone / internet_job : "yes" when any of the
#       first three slots holds the matching answer, NA otherwise
#   internet_none : "none" when one of slots 1-4 is "No" and every other
#       slot is missing, NA otherwise
#   internet_dk   : "dk" when one of slots 1, 2, 3 or 5 is the don't-know
#       answer and every other slot is missing, NA otherwise
dat2013 <- 
  dat2013 %>%
  select(-starts_with("INTACCESS_")) %>%
  mutate(
    internet_home = case_when(
      INTERNET_1 == "Yes, home" ~ "yes",
      INTERNET_2 == "Yes, home" ~ "yes",
      INTERNET_3 == "Yes, home" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_phone = case_when(
      INTERNET_1 == "Yes, phone" ~ "yes",
      INTERNET_2 == "Yes, phone" ~ "yes",
      INTERNET_3 == "Yes, phone" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_job = case_when(
      INTERNET_1 == "Yes, job" ~ "yes",
      INTERNET_2 == "Yes, job" ~ "yes",
      INTERNET_3 == "Yes, job" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_none = case_when(
      INTERNET_1 == "No" & is.na(INTERNET_2) & is.na(INTERNET_3) &
        is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "none",
      INTERNET_2 == "No" & is.na(INTERNET_1) & is.na(INTERNET_3) &
        is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "none",
      INTERNET_3 == "No" & is.na(INTERNET_2) & is.na(INTERNET_1) &
        is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "none",
      INTERNET_4 == "No" & is.na(INTERNET_2) & is.na(INTERNET_3) &
        is.na(INTERNET_1) & is.na(INTERNET_5)  ~ "none",
      TRUE ~ NA_character_
    ),
    internet_dk = case_when(
      INTERNET_1 == "'Don't Know/refused" & is.na(INTERNET_2) & 
        is.na(INTERNET_3) & is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "dk",
      INTERNET_2 == "'Don't Know/refused" & is.na(INTERNET_1) & 
        is.na(INTERNET_3) & is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "dk",
      INTERNET_3 == "'Don't Know/refused" & is.na(INTERNET_2) & 
        is.na(INTERNET_1) & is.na(INTERNET_4) & is.na(INTERNET_5)  ~ "dk",
      # The "other slots are missing" test must look at slot 4 here. Testing
      # is.na(INTERNET_5) together with INTERNET_5 == "..." can never be
      # TRUE, which made this branch unreachable.
      INTERNET_5 == "'Don't Know/refused" & is.na(INTERNET_2) & 
        is.na(INTERNET_3) & is.na(INTERNET_1) & is.na(INTERNET_4)  ~ "dk",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-INTERNET_1, -INTERNET_2, -INTERNET_3, -INTERNET_4, -INTERNET_5,
         -starts_with("NewInternet_")) 

# Move the identifier, date and geography columns to the front (in this
# order); all remaining columns keep their relative order behind them.
dat2013_clean <-
  relocate(
    dat2013,
    respnumber, id, wave, date, year, month, day,
    world_region, country, geo1, geo2, geo3
  )


# Clean 2015 ---------------------------
# Read the raw 2015 Stata file
dat2015 <- read_dta("input/ADLraw/ADL 2015 Combined File.dta")

# Identifier, age and weight columns become plain numerics; every remaining
# column goes through as_character() (per the original note, this turns the
# values into their labels).
dat2015 <-
  dat2015 %>%
  mutate(
    across(
      c(respnumber, wave, id,
        yearborn, Age, schoolage, weight,
        InternationalProportion, InternationalTotalWeight),
      as.numeric
    ),
    across(
      -c(respnumber, wave, id,
         yearborn, Age, schoolage, weight,
         InternationalProportion, InternationalTotalWeight),
      as_character
    )
  )

# Dates and country name
# No date is parsed for this wave: year is fixed at 2015, month and day are
# NA. `wave` is already numeric after the conversion above. `country` is the
# upper-cased TotalCountry; the helper columns are then dropped.
dat2015 <-
  dat2015 %>%
  mutate(
    year = 2015,
    month = NA_real_,
    day = NA_real_,
    country = toupper(TotalCountry)
  ) %>%
  select(-c(New_Age, TotalCountry, COUNTRY_II))

# Consolidate ETHNICITY, LANG, GEO (some seem named differently)
# The per-country ETHNICITY_* (also spelled ETHNICTY_*) and LANG_* columns
# are collapsed into `ethnicity` and `language`. The USA geography columns
# are copied into geo1/geo2/geo3 first and then removed, so that the
# following unite() of the remaining GEO_* columns does not pick them up.
dat2015 <- 
  dat2015 %>% 
  unite(ethnicity, matches("^(ETHNICITY_|ETHNICTY_)"), sep = "", na.rm = TRUE) %>% 
  unite(language, starts_with("LANG_"), sep = "", na.rm = TRUE) %>%
  mutate(
    geo1 = case_when(
      country == "USA" ~ GEO_USA,
      TRUE ~ NA_character_
    ),
    geo2 = case_when(
      country == "USA" ~ GEO_USA2,
      TRUE ~ NA_character_
    ),
    geo3 = case_when(
      country == "USA" ~ GEO_USA3,
      TRUE ~ NA_character_
    )
  ) %>%
  select(-GEO_USA, -GEO_USA2, -GEO_USA3, -GEO_USA4) %>% 
  unite(geo, starts_with("GEO_"), sep = "", na.rm = TRUE) %>% 
  mutate(
    # a row with no GEO_* value is united to ""; turn that back into NA
    geo = na_if(geo, ""), 
    geo1 = case_when(
      country == "USA" ~ geo1,
      country == "ROMANIA" ~ GEO3_ROMANIA,
      TRUE ~ geo
    )
  ) %>% 
  select(-geo, -GEO3_ROMANIA)

# Consolidate NewEthnicity_Iran, NewEthnicity_Turkey
# The existing `ethnicity` column has to be part of the selection: unite()
# replaces a column that has the same name as its output, so uniting only
# the NewEthnicity_* columns would discard the values consolidated above.
# NOTE(review): if a NewEthnicity_* column repeats a value already present
# in `ethnicity` for the same respondent, the two strings are pasted
# together -- check Iran/Turkey rows in the output.
dat2015 <- 
  dat2015 %>%
  mutate(ethnicity = na_if(ethnicity, "")) %>% 
  unite(ethnicity, ethnicity, starts_with("NewEthnicity_"),
        sep = "", na.rm = TRUE)

# Rename RegionII
dat2015 <- 
  dat2015 %>% 
  rename(world_region = RegionII)

# Internet access (NewInternet_1..NewInternet_5)
# The five multiple-response slots are turned into indicator columns:
#   internet_home / internet_phone / internet_job : "yes" when any of the
#       first three slots holds the matching answer, NA otherwise
#   internet_none : "none" when one of slots 1-4 is "No" and every other
#       slot is missing, NA otherwise
#   internet_dk   : "dk" when one of slots 1, 2, 3 or 5 is the don't-know
#       answer and every other slot is missing, NA otherwise
dat2015 <- 
  dat2015 %>% 
  mutate(
    internet_home = case_when(
      NewInternet_1 == "Yes, home" ~ "yes",
      NewInternet_2 == "Yes, home" ~ "yes",
      NewInternet_3 == "Yes, home" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_phone = case_when(
      NewInternet_1 == "Yes, phone" ~ "yes",
      NewInternet_2 == "Yes, phone" ~ "yes",
      NewInternet_3 == "Yes, phone" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_job = case_when(
      NewInternet_1 == "Yes, job" ~ "yes",
      NewInternet_2 == "Yes, job" ~ "yes",
      NewInternet_3 == "Yes, job" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_none = case_when(
      NewInternet_1 == "No" & is.na(NewInternet_2) & is.na(NewInternet_3) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_2 == "No" & is.na(NewInternet_1) & is.na(NewInternet_3) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_3 == "No" & is.na(NewInternet_2) & is.na(NewInternet_1) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_4 == "No" & is.na(NewInternet_2) & is.na(NewInternet_3) &
        is.na(NewInternet_1) & is.na(NewInternet_5)  ~ "none",
      TRUE ~ NA_character_
    ),
    internet_dk = case_when(
      NewInternet_1 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_3) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      NewInternet_2 == "'Don't Know/refused" & is.na(NewInternet_1) & 
        is.na(NewInternet_3) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      NewInternet_3 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_1) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      # The "other slots are missing" test must look at slot 4 here. Testing
      # is.na(NewInternet_5) together with NewInternet_5 == "..." can never
      # be TRUE, which made this branch unreachable.
      NewInternet_5 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_3) & is.na(NewInternet_1) & is.na(NewInternet_4)  ~ "dk",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-starts_with("NewInternet_")) 

# Move the identifier, date and geography columns to the front (in this
# order); all remaining columns keep their relative order behind them.
dat2015_clean <-
  relocate(
    dat2015,
    respnumber, id, wave, year, month, day,
    world_region, country, geo1, geo2, geo3
  )


# Stack the 2013 and 2015 waves. bind_rows() matches columns by name, so a
# column present in only one wave is filled with NA for the other.
dat_all <- dat2013_clean %>% bind_rows(dat2015_clean)


# Clean 2017 ---------------------------
# Load data 
dat2017 <- 
  read_dta("input/ADLraw/ADL 2017 Combined File.dta")

# Change to labels
# wave, id, yearborn, Age and the weight become plain numerics; every other
# column goes through as_character(). A running row number is stored as
# `respnumber` and `id` is dropped.
dat2017 <- 
  dat2017 %>% 
  mutate(
    across(c(wave, id, yearborn, Age, InternationalTotalWeight),
           as.numeric),
    across(-c(wave, id, yearborn, Age, InternationalTotalWeight),
           as_character)
  ) %>%
  # seq_len(n()) instead of seq(1:nrow(dat2017)): same 1..N integers, but it
  # also yields an empty vector (rather than 1:2) for a zero-row table
  mutate(respnumber = seq_len(n())) %>%
  select(-id)

# Country name
# `country` is the upper-cased text of COUNTRY_2017. The US Muslim
# oversample is flagged (1/0) in US_MUSLIM_SAMPLE *before* the two USA
# samples are folded into "USA"; "UK" is spelled out as "UNITED KINGDOM".
dat2017 <-
  dat2017 %>%
  mutate(
    country = toupper(as.character(as_factor(COUNTRY_2017))),
    US_MUSLIM_SAMPLE = as.numeric(country == "USA - MUSLIM"),
    country = recode(
      country,
      UK = "UNITED KINGDOM",
      `USA - MAIN` = "USA",
      `USA - MUSLIM` = "USA"
    )
  ) %>%
  select(-c(COUNTRY_2017, COUNTRY_II))

# Date
# `date` keeps the raw string. year / month / day are sliced out of `Date`
# by position (last four characters, first character, characters 2-3) and
# set to NA for Mexico; the original `Date` column is then dropped.
# NOTE(review): the month is read from a single character, so this only
# works if every interview month has one digit -- confirm against the raw
# `Date` values.
dat2017 <-
  dat2017 %>%
  mutate(
    date = as.character(Date),
    year = replace(
      as.numeric(str_sub(Date, start = -4)),
      country %in% "MEXICO",
      NA_real_
    ),
    month = replace(
      as.numeric(str_sub(Date, end = 1)),
      country %in% "MEXICO",
      NA_real_
    ),
    day = replace(
      as.numeric(str_sub(Date, start = 2, end = 3)),
      country %in% "MEXICO",
      NA_real_
    )
  ) %>%
  select(-Date)

# Consolidate ethnicity and language 
# Columns that are not carried forward are dropped first; the remaining
# per-country ethnicity columns are pasted into `ethnicity`. In
# LANG_USA_STRING, "" becomes NA and the codes "1" / "2" are spelled out as
# "ENGLISH" / "ARABIC" before all LANG_* columns are pasted into `language`.
dat2017 <- 
  dat2017 %>% 
  select(-ETHNICITY_EURO_2017, -ETHNICITY_USA, -USRACE_2017, 
         -HISPANIC_2017, -US_RACELIST, -ETHNICITY_MEXICO_OTHER,
         -ETHNICITY_USA_STRING, -LANG_FRANCE, -LANG_GERMANY, -LANG_UK) %>%
  unite(ethnicity, c(CombRace_USAONLY, ETHNICTY_MEXICO,
                     ETHNICITY_UK_NEW, ETHNICITY_FRANCE_NEW, 
                     ETHNICITY_GERMANY_NEW), sep = "", na.rm = TRUE) %>% 
  mutate(LANG_USA_STRING = na_if(LANG_USA_STRING, "") %>%
           recode(`1` = "ENGLISH", `2` = "ARABIC")) %>%
  unite(language, starts_with("LANG_"), sep = "", na.rm = TRUE)

# Consolidate geo
# Three levels of geography are built by pasting the listed per-country
# columns together (non-missing values only, no separator).
dat2017 <- 
  dat2017 %>%
  select(-GEO_EURO_2017) %>% 
  unite(geo1, c(georecode, GEO_USA, GEO_MEXICO, GEO_UK_NEW,
                  GEO_FRANCE_NEW, GEO_GERMANY_NEW), 
        sep = "", na.rm = TRUE) %>% 
  unite(geo2, c(GEOII_USA, UK_NUTS1, GERMANY_NUTS1, FRANCE_NUTS1), 
        sep = "", na.rm = TRUE) %>%
  unite(geo3, c(US_STATE, UK_NUTS2, GERMANY_NUTS2, FRANCE_NUTS2),
        sep = "", na.rm = TRUE)

# Drop the religion, age and phone-type helper columns that are not
# carried into the combined file.
dat2017 <-
  dat2017 %>%
  select(-c(
    # religion
    US_RELIGIONLIST, USAMUSLIM_RELIGION, RELIGION_STRINGOTHER,
    # age
    New_Age, New_Age50, ED_Age2, EDAGE_2017,
    # phone type
    PHONETYPE_RECODE
  ))

# Internet access 
# NewInternet_1..NewInternet_5 are multiple-response slots. They are turned
# into indicator columns:
#   internet_home / internet_phone / internet_job : "yes" when any of the
#       first three slots holds the matching answer, NA otherwise
#   internet_none : "none" when one of slots 1-4 is "No" and every other
#       slot is missing, NA otherwise
#   internet_dk   : "dk" when one of slots 1, 2, 3 or 5 is the don't-know
#       answer and every other slot is missing, NA otherwise
dat2017 <- 
  dat2017 %>% 
  mutate(
    internet_home = case_when(
      NewInternet_1 == "Yes, home" ~ "yes",
      NewInternet_2 == "Yes, home" ~ "yes",
      NewInternet_3 == "Yes, home" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_phone = case_when(
      NewInternet_1 == "Yes, phone" ~ "yes",
      NewInternet_2 == "Yes, phone" ~ "yes",
      NewInternet_3 == "Yes, phone" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_job = case_when(
      NewInternet_1 == "Yes, job" ~ "yes",
      NewInternet_2 == "Yes, job" ~ "yes",
      NewInternet_3 == "Yes, job" ~ "yes",
      TRUE ~ NA_character_
    ),
    internet_none = case_when(
      NewInternet_1 == "No" & is.na(NewInternet_2) & is.na(NewInternet_3) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_2 == "No" & is.na(NewInternet_1) & is.na(NewInternet_3) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_3 == "No" & is.na(NewInternet_2) & is.na(NewInternet_1) &
        is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "none",
      NewInternet_4 == "No" & is.na(NewInternet_2) & is.na(NewInternet_3) &
        is.na(NewInternet_1) & is.na(NewInternet_5)  ~ "none",
      TRUE ~ NA_character_
    ),
    internet_dk = case_when(
      NewInternet_1 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_3) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      NewInternet_2 == "'Don't Know/refused" & is.na(NewInternet_1) & 
        is.na(NewInternet_3) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      NewInternet_3 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_1) & is.na(NewInternet_4) & is.na(NewInternet_5)  ~ "dk",
      # The "other slots are missing" test must look at slot 4 here. Testing
      # is.na(NewInternet_5) together with NewInternet_5 == "..." can never
      # be TRUE, which made this branch unreachable.
      NewInternet_5 == "'Don't Know/refused" & is.na(NewInternet_2) & 
        is.na(NewInternet_3) & is.na(NewInternet_1) & is.na(NewInternet_4)  ~ "dk",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-starts_with("NewInternet_")) 

# Correct the misspelled column name USMIS_TRUMPRACIST
dat2017 <- rename(dat2017, USMUS_TRUMPRACIST = USMIS_TRUMPRACIST)


# Bind data sets -------------------------------------
# Append the 2017 wave to the already stacked 2013 + 2015 rows
dat_all <- bind_rows(dat_all, dat2017)

# Standardize text across waves by lower-casing every character column
dat_all_lower <-
  dat_all %>%
  mutate(across(where(is.character), tolower))

# Ages recorded as 2013, 2014 or 2016 are treated as missing; the column is
# then renamed to lower-case `age`.
dat_all_lower <-
  dat_all_lower %>%
  mutate(Age = replace(Age, Age %in% c(2013, 2014, 2016), NA)) %>%
  rename(age = Age)

# Drop the weight / scale columns that ADL flagged as irrelevant, and shift
# the wave codes 2, 3, 4 to 1, 2, 3 (any other wave value is left as is).
dat_all_clean <-
  dat_all_lower %>%
  select(-c(
    weight, InternationalFirstWaveWeight, InternationalProportion,
    First50GlobalShare, DNU_TFJNEIGHBOR, DNU_METJEW,
    q7ascale, q7bscale, q7cscale, q7dscale, q7escale, q7fscale,
    q7gscale, q7hscale, q7iscale, q7jscale, q7kscale, q7lscale,
    Q8ASCALE, Q8BSCALE, Q7A_DSCALE, Q7B_LScale, Q7_11ptScale,
    Q7aSignificant, Q7aNotSignificant, Q10Filtered
  )) %>%
  mutate(wave = recode(wave, `2` = 1, `3` = 2, `4` = 3))

# Turn the (lower-cased) text answers into integer codes. match() returns
# the position of the answer in the listed scale and NA for anything else,
# so unlisted answers and missing values become NA.
dat_all_clean <-
  dat_all_clean %>%
  mutate(
    # 0 = wrong direction, 1 = right direction
    direction = match(direction, c("wrong direction", "right direction")) - 1L,
    # 1 = poor ... 4 = excellent
    economy = match(economy, c("poor", "not so good", "good", "excellent")),
    persfinance = match(
      persfinance,
      c("poor", "not so good", "good", "excellent")
    ),
    # 1 = very unstable ... 4 = very stable
    polsit = match(
      polsit,
      c("very unstable", "somewhat unstable", "somewhat stable", "very stable")
    ),
    # 0 = male, 1 = female
    gender = match(gender, c("male", "female")) - 1L,
    # don't-know answers are blanked, then the sources become a factor
    # with the level order given here
    newssource = factor(
      replace(
        newssource,
        newssource %in% c("[vol] don't know/refused", "don't know/refused"),
        NA_character_
      ),
      levels = c("other people", "radio", "newspaper",
                 "television", "the internet")
    ),
    # a birth year of 0 is treated as missing
    yearborn = na_if(yearborn, 0)
  )


# Write the full cleaned data set into the output folder created at the top
# of the script.
write_csv(dat_all_clean, file.path(output_dir, "adl_full.csv"))

# Make country-year-sample size output 
# One row per country and wave with the number of respondents `n`, ordered
# by wave and then country. count() returns an ungrouped table, whereas
# group_by_all() + summarise() left the result grouped by country.
country_year_sample <- 
  dat_all_clean %>%
  count(country, wave) %>%
  arrange(wave, country)

write_csv(country_year_sample, 
          file.path(output_dir, "country_year_n.csv"))


# Subset and further clean responses for later analysis 
# The .RData file is expected to provide `un07` and `un15` (presumably the
# country names of the 2007 and 2015 signatories -- confirm against the
# file). Three sets are derived from them:
#   un_countries : countries in either list
#   un_both      : countries in both lists
#   un_either    : countries in exactly one of the two lists
load("input/UN_signatories.RData")
# union(), not unique(un07, un15): the second argument of unique() is
# `incomparables`, so that call returned only the values of un07 and
# silently left out countries that appear in un15 alone.
un_countries <- union(un07, un15)
un_both <- intersect(un07, un15)
un_either <- setdiff(un_countries, un_both)

# Analysis subset: a selection of the cleaned columns plus derived
# education and UN-signatory variables, with the stereotype (tf*) and
# outgroup-sentiment (*spop) answers standardized.
# The pipeline is built once, with anonymous functions inside across()
# (passing extra arguments through across(..., fn, arg) is deprecated in
# dplyr). `adl_subset_new` used to be a second, identical run of the same
# pipeline and is kept as a copy so that the name stays defined.
adl_subset <- 
  dat_all_clean %>% 
  select(wave, world_region, country, ED_Age, religion, 
         year, direction, economy, persfinance, polsit,
         gender, yearborn, newssource,
         jewspop, muslimspop, christianspop, hinduspop, buddhistspop,
         starts_with("tf")) %>% 
  mutate(
    # education to linear categories 
    education = case_when(
      ED_Age %in% c("5-12", "0-12") ~ 1,
      ED_Age == "13-18" ~ 2,
      ED_Age == "19-22" ~ 3,
      ED_Age == "23+" ~ 4,
      TRUE ~ NA_real_
    ),
    education_fct = factor(education, levels = c(1, 2, 3, 4)),
    # clean religion responses
    religion = case_when(
      religion %in% c("[vol]don't know/refused",
                      "don't know/refused") ~ NA_character_,
      TRUE ~ religion
    ),
    # 1 when the country is in un_countries, 0 otherwise
    signed_un = if_else(country %in% un_countries, 1L, 0L, NA_integer_),
    # 2 = in un_both, 1 = in un_either, 0 = in neither
    signed_num = case_when(
      country %in% un_both ~ 2L,
      country %in% un_either ~ 1L,
      TRUE ~ 0L
    ) %>% factor(levels = c(0, 1, 2))
  ) %>%
  # standardize response categories for stereotypes
  mutate(
    across(
      starts_with("tf"),
      \(x) recode(x, `[vol] don't recognize` = "don't know")
    )
  ) %>%
  # standardize response categories in outgroup sentiment: strip a leading
  # "[vol] " first, then treat "don't recognize" as missing
  mutate(
    across(
      ends_with("spop"),
      \(x) na_if(str_replace(x, "^\\[vol\\] ", ""), "don't recognize")
    )
  )

adl_subset_new <- adl_subset


write_csv(adl_subset, file.path(output_dir, "adl_full_subset.csv"))



